{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "68909c78",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/home/sourab/transformers/src/transformers/models/auto/tokenization_auto.py:640: FutureWarning: The `use_auth_token` argument is deprecated and will be removed in v5 of Transformers.\n",
      "  warnings.warn(\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "<|endoftext|>\n"
     ]
    }
   ],
   "source": [
    "from transformers import AutoTokenizer\n",
    "\n",
    "tokenizer = AutoTokenizer.from_pretrained(\"tiiuae/falcon-180B-chat\", use_auth_token=True)\n",
    "eos_token = tokenizer.eos_token\n",
    "print(eos_token)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "id": "ea9e91c7",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "You are an AI assistant that follows instruction extremely well. Help as much as you can.\n"
     ]
    }
   ],
   "source": [
    "from datasets import load_dataset\n",
    "\n",
    "\"\"\"\n",
    "We will combine following datasets for following instructions and converse :\n",
    "1. timdettmers/openassistant-guanaco: 100%\n",
    "2. GAIR/lima: 100%\n",
    "3. garage-bAInd/Open-Platypus: 100%\n",
    "4. Open-Orca/OpenOrca: 10k of GPT4 split\n",
    "5. ehartford/dolphin: 10k of GPT4 split\n",
    "6. stingning/ultrachat: 10k\n",
    "7. jondurbin/airoboros-2.2: 10k while **filtering out** samples with `skip_prompt_formatting==True`\n",
    "\"\"\"\n",
    "\n",
    "\n",
    "\"\"\"\n",
    "Format of the dataset:\n",
    "\n",
    "<|system|> system message <|endoftext|> <|prompter|> Q1 <|endoftext|> <|assistant|> A1 <|endoftext|>\n",
    "\"\"\"\n",
    "\n",
    "\n",
    "system_prompt = \"\"\"You are an AI assistant that follows instruction extremely well. Help as much as you can.\"\"\"\n",
    "\n",
    "print(system_prompt)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "id": "3beab4cd",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "f3809f7b6a77420497b85f23dfa30f9d",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Map:   0%|          | 0/1030 [00:00<?, ? examples/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "f64382e937234352b777743ff062a985",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Map:   0%|          | 0/300 [00:00<?, ? examples/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "DatasetDict({\n",
      "    train: Dataset({\n",
      "        features: ['content'],\n",
      "        num_rows: 1030\n",
      "    })\n",
      "    test: Dataset({\n",
      "        features: ['content'],\n",
      "        num_rows: 300\n",
      "    })\n",
      "})\n",
      "{'content': '<|system|> You are an AI assistant that follows instruction extremely well. Help as much as you can. <|endoftext|> <|prompter|> Are we alone in the universe? <|endoftext|> <|assistant|> Humanity has yet to find evidence for life outside planet Earth.\\n\\nThe current search for extraterrestrial life is largely focused on finding planets that are situated in an \"habitable zone\". Roughly put, a planet is in a habitable zone if its distance from nearby stars allows for liquid water to persist on its surface.\\nSo far, a few such planets have been found, but none yet with liquid water on its surface. <|endoftext|>'}\n"
     ]
    }
   ],
   "source": [
    "# LIMA dataset processing\n",
    "\n",
    "def preprocess(samples):\n",
    "    conv_prefix = f\"<|system|> {system_prompt} {eos_token}\"\n",
    "    batch = []\n",
    "    for sample in samples[\"conversations\"]:\n",
    "        formatted_conv = conv_prefix\n",
    "        for i, turn in enumerate(sample):\n",
    "            turn_prefix = \"<|assistant|>\" if (i+1)%2==0 else \"<|prompter|>\"\n",
    "            formatted_conv += f\" {turn_prefix} {turn} {eos_token}\"\n",
    "        batch.append(formatted_conv)\n",
    "    return {\"content\": batch}\n",
    "            \n",
    "    \n",
    "\n",
    "\n",
    "lima = load_dataset(\"GAIR/lima\")\n",
    "lima = lima.map(\n",
    "    preprocess,\n",
    "    batched=True,\n",
    "    remove_columns=lima[\"train\"].column_names\n",
    ")\n",
    "\n",
    "lima[\"train\"] = lima[\"train\"].shuffle(100)\n",
    "\n",
    "print(lima)\n",
    "print(lima[\"train\"][0])\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "id": "e7529570",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "a244c14770f340a49783a89cdebbb349",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Downloading readme:   0%|          | 0.00/395 [00:00<?, ?B/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Repo card metadata block was not found. Setting CardData to empty.\n"
     ]
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "2b4e7eb854774e83995cf3cd9b2ecd51",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "bb192e01b2484795adf3004c54cca4a7",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Downloading data:   0%|          | 0.00/20.9M [00:00<?, ?B/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "d137053a0de14bfd911e8b9153e5ec59",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Downloading data:   0%|          | 0.00/1.11M [00:00<?, ?B/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "ec9fa93b89e942aea86e8b9566fa439a",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Extracting data files:   0%|          | 0/2 [00:00<?, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "596e122a202c433883daa2cb399693ad",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Generating train split: 0 examples [00:00, ? examples/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "0d6b5c411a4042b483439b60603031e6",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Generating test split: 0 examples [00:00, ? examples/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "7823633116484f8a811a53d6685279bf",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Map:   0%|          | 0/9846 [00:00<?, ? examples/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "caed6ec21afd498daf4bcc14b4cc49b4",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Map:   0%|          | 0/518 [00:00<?, ? examples/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "DatasetDict({\n",
      "    train: Dataset({\n",
      "        features: ['content'],\n",
      "        num_rows: 9846\n",
      "    })\n",
      "    test: Dataset({\n",
      "        features: ['content'],\n",
      "        num_rows: 518\n",
      "    })\n",
      "})\n",
      "{'content': '<|system|> You are an AI assistant that follows instruction extremely well. Help as much as you can. <|endoftext|> <|prompter|> En un futuro no muy lejano, la humanidad ha descubierto la forma de crear robots con personalidades únicas y sentimientos. Un día, en un pequeño taller en la ciudad, un grupo de robots decide hacer una fiesta para celebrar su amistad. Mientras preparan la fiesta, discuten sobre qué tipo de comida deben preparar y qué juegos deben jugar. ¿Qué decisión tomarán y cómo se desarrollará la fiesta? ¡Sé creativo y haz que la fiesta sea divertida y llena de risas! <|endoftext|> <|assistant|> Mientras se desarrolla la fiesta, los robots se dan cuenta que el lugar no es muy grande y por ello deciden jugar juegos de mesa, uno de ellos, quien es el más precavido, trae consigo un juego de cartas.  Los robots comienzan a jugar y alguno de ellos les comparte una rica botana de tornillos y clavos, aderezados con un líquido viscoso que ayuda a aceitarlos y mantenerlos en buena forma. Al final deciden girar una botella y al robot que apunte, deberá bailar frente a los demás. <|endoftext|>'}\n"
     ]
    }
   ],
   "source": [
    "# OpenAsst Guanaco dataset processing\n",
    "\n",
    "tokens = [\"### Human:\", \"### Assistant:\"]\n",
    "\n",
    "import re\n",
    "\n",
    "def split_on_multiple_tokens(input_string, tokens):\n",
    "    # Combine the tokens into a regular expression pattern using the '|' (OR) operator\n",
    "    pattern = '|'.join(re.escape(token) for token in tokens)\n",
    "    \n",
    "    # Split the input string using the generated pattern\n",
    "    split_result = re.split(pattern, input_string)\n",
    "    \n",
    "    # Remove any empty strings resulting from consecutive delimiters\n",
    "    split_result = [part.strip() for part in split_result if part.strip()]\n",
    "    \n",
    "    return split_result\n",
    "\n",
    "def preprocess(samples):\n",
    "    conv_prefix = f\"<|system|> {system_prompt} {eos_token}\"\n",
    "    batch = []\n",
    "    for sample in samples[\"text\"]:\n",
    "        sample = split_on_multiple_tokens(sample, tokens)\n",
    "        formatted_conv = conv_prefix\n",
    "        for i, turn in enumerate(sample):\n",
    "            turn_prefix = \"<|assistant|>\" if (i+1)%2==0 else \"<|prompter|>\"\n",
    "            formatted_conv += f\" {turn_prefix} {turn} {eos_token}\"\n",
    "        batch.append(formatted_conv)\n",
    "    return {\"content\": batch}\n",
    "            \n",
    "    \n",
    "\n",
    "\n",
    "guanaco = load_dataset(\"timdettmers/openassistant-guanaco\")\n",
    "guanaco = guanaco.map(\n",
    "    preprocess,\n",
    "    batched=True,\n",
    "    remove_columns=guanaco[\"train\"].column_names\n",
    ")\n",
    "\n",
    "guanaco[\"train\"] = guanaco[\"train\"].shuffle()\n",
    "\n",
    "print(guanaco)\n",
    "print(guanaco[\"train\"][0])\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "id": "2a6a0f99",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "DatasetDict({\n",
      "    train: Dataset({\n",
      "        features: ['content'],\n",
      "        num_rows: 22426\n",
      "    })\n",
      "    test: Dataset({\n",
      "        features: ['content'],\n",
      "        num_rows: 2500\n",
      "    })\n",
      "})\n",
      "{'content': '<|system|> You are an AI assistant that follows instruction extremely well. Help as much as you can. <|endoftext|> <|prompter|> Which is the smoothest?\\nA: asphalt road\\nB: rock wall\\nC: ceramic mug <|endoftext|> <|assistant|> Smooth is a property. A smooth material is not rough or bumpy.\\nLook at each picture, one at a time. Imagine touching the material shown in each picture.\\nOf the choices, the ceramic mug is the smoothest. If you touch a piece of ceramic like this one, it will not feel rough. <|endoftext|>'}\n"
     ]
    }
   ],
   "source": [
    "# Open Playtpus\n",
    "def preprocess(samples):\n",
    "    conv_prefix = f\"<|system|> {system_prompt} {eos_token}\"\n",
    "    batch = []\n",
    "    for instruction, input_text, output_text in zip(samples[\"instruction\"], samples[\"input\"], samples[\"output\"]):\n",
    "        formatted_conv = conv_prefix\n",
    "        formatted_conv += f\" <|prompter|> {instruction} {input_text} {eos_token}\" if len(input_text)>0 else f\" <|prompter|> {instruction} {eos_token}\"\n",
    "        formatted_conv += f\" <|assistant|> {output_text} {eos_token}\"\n",
    "        batch.append(formatted_conv)\n",
    "    return {\"content\": batch}\n",
    "            \n",
    "    \n",
    "\n",
    "\n",
    "platypus = load_dataset(\"garage-bAInd/Open-Platypus\")\n",
    "platypus = platypus.map(\n",
    "    preprocess,\n",
    "    batched=True,\n",
    "    remove_columns=platypus[\"train\"].column_names\n",
    ")\n",
    "\n",
    "platypus[\"train\"] = platypus[\"train\"].shuffle()\n",
    "platypus = platypus[\"train\"].train_test_split(2500)\n",
    "print(platypus)\n",
    "print(platypus[\"train\"][0])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "id": "34c49b5a",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "9dbe4e445841415d9d45a76b5efbdb11",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Map:   0%|          | 0/10000 [00:00<?, ? examples/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "d998e2411864468bba09e071b06ff30d",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Map:   0%|          | 0/5000 [00:00<?, ? examples/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "DatasetDict({\n",
      "    train: Dataset({\n",
      "        features: ['content'],\n",
      "        num_rows: 10000\n",
      "    })\n",
      "    test: Dataset({\n",
      "        features: ['content'],\n",
      "        num_rows: 5000\n",
      "    })\n",
      "})\n",
      "{'content': '<|system|> You are an AI assistant that follows instruction extremely well. Help as much as you can. <|endoftext|> <|prompter|> Please answer the following question: Answer the question based on the following text.  Question:   Jan poured a liquid solution into a bottle and the solution was not dense, so in the bottle it took up more room or less room?    Text:  More dense water takes up less space than less dense water.\\nA: <|endoftext|> <|assistant|> In the bottle, the solution took up more room as it was not dense. <|endoftext|>'}\n"
     ]
    }
   ],
   "source": [
    "# Dolphin\n",
    "\n",
    "dolphin = load_dataset(\"ehartford/dolphin\",data_files=\"flan1m-alpaca-uncensored.jsonl\")\n",
    "dolphin = dolphin[\"train\"].shuffle()\n",
    "dolphin_subset = dolphin.train_test_split(10000)\n",
    "\n",
    "test = dolphin_subset[\"train\"].train_test_split(5000)\n",
    "dolphin_subset[\"train\"] = dolphin_subset[\"test\"]\n",
    "dolphin_subset[\"test\"] = test[\"test\"]\n",
    "\n",
    "\n",
    "def preprocess(samples):\n",
    "    batch = []\n",
    "    for instruction, input_text, output_text in zip(samples[\"instruction\"], samples[\"input\"], samples[\"output\"]):\n",
    "        conv_prefix = f\"<|system|> {instruction} {eos_token}\"\n",
    "        formatted_conv = conv_prefix\n",
    "        formatted_conv += f\" <|prompter|> {input_text} {eos_token}\"\n",
    "        formatted_conv += f\" <|assistant|> {output_text} {eos_token}\"\n",
    "        batch.append(formatted_conv)\n",
    "    return {\"content\": batch}\n",
    "\n",
    "dolphin_subset = dolphin_subset.map(\n",
    "    preprocess,\n",
    "    batched=True,\n",
    "    remove_columns=dolphin_subset[\"train\"].column_names\n",
    ")\n",
    "\n",
    "\n",
    "print(dolphin_subset)\n",
    "print(dolphin_subset[\"train\"][0])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 30,
   "id": "e1916702",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "22fad3ddd9c640ed96afb086062ceaa3",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Map:   0%|          | 0/10000 [00:00<?, ? examples/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "aaad675d6d9b433fab77e39e471189ae",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Map:   0%|          | 0/5000 [00:00<?, ? examples/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "DatasetDict({\n",
      "    train: Dataset({\n",
      "        features: ['content'],\n",
      "        num_rows: 10000\n",
      "    })\n",
      "    test: Dataset({\n",
      "        features: ['content'],\n",
      "        num_rows: 5000\n",
      "    })\n",
      "})\n",
      "{'content': '<|system|> You are an AI assistant. User will you give you a task. Your goal is to complete the task as faithfully as you can. While performing the task think step-by-step and justify your steps. <|endoftext|> <|prompter|> What\\'s the most logical way to complete this passage?\\n\\nBy Beth Stebner PUBLISHED: 14:24 EST, 18 March 2012 | UPDATED: 05:42 EST, 20 March 2012 For the second time during the primaries, Republican presidential hopeful Rick Santorum has made a blunder about Puerto Rico. On the day Puerto Ricans head to the polls to cast their votes for the nation\\'s next president, Mr Santorum referred to the archipelago as a ‘Spanish-speaking country.’ In fact, Puerto Rico is not a country at all, but an unincorporated territory of the United States. Last week, the former Pennsylvania senator told a Puerto Rican newspaper that the territory’s admittance as the America’s 51st state was contingent on its residents speaking English as its official language.\\n\\nOK by me in <|endoftext|> <|assistant|> To complete the passage logically, I\\'ll go step by step considering the context and information provided.\\n\\nStep 1: Identify the issue that needs to be addressed\\nThe passage discusses Rick Santorum\\'s blunders about Puerto Rico during the primaries.\\n\\nStep 2: Provide clarification or context to the blunders\\nThe passage has already detailed the two blunders made by Santorum: \\n1. Referring to Puerto Rico as a \"Spanish-speaking country.\"\\n2. Saying that the territory\\'s admittance as the U.S.\\'s 51st state is contingent on residents speaking English as its official language.\\n\\nStep 3: Discuss the consequences or reactions to the blunders\\nSince the passage is about Santorum\\'s mistakes, it is logical to mention any consequences or reactions as a result of these blunders.\\n\\nStep 4: Conclude the passage\\nEnd the passage by wrapping up the main points or noting any further developments in the situation.\\n\\nBased on the previous steps, the most logical way to complete the passage would be:\\n\\nThese mistakes have caused backlash from Puerto Ricans, who feel their status and language have been misrepresented by Mr. Santorum. Critics argue that his comments reflect a lack of understanding about the relationship between the United States and Puerto Rico, as well as the importance of preserving the island\\'s Spanish-speaking heritage. Furthermore, they point out that the U.S. does not have an official national language, making Santorum\\'s statement about English as a prerequisite for statehood irrelevant. Moving forward, it remains to be seen whether these blunders will have any lasting impact on Mr. Santorum\\'s presidential campaign or influence voters\\' opinions on his qualifications for office. <|endoftext|>'}\n"
     ]
    }
   ],
   "source": [
    "# Open Orca\n",
    "orca = load_dataset(\"Open-Orca/OpenOrca\",data_files=\"1M-GPT4-Augmented.parquet\")\n",
    "orca = orca[\"train\"].shuffle()\n",
    "orca_subset = orca.train_test_split(10000)\n",
    "\n",
    "test = orca_subset[\"train\"].train_test_split(5000)\n",
    "orca_subset[\"train\"] = orca_subset[\"test\"]\n",
    "orca_subset[\"test\"] = test[\"test\"]\n",
    "\n",
    "\n",
    "def preprocess(samples):\n",
    "    batch = []\n",
    "    for instruction, input_text, output_text in zip(samples[\"system_prompt\"], samples[\"question\"], samples[\"response\"]):\n",
    "        conv_prefix = f\"<|system|> {instruction} {eos_token}\"\n",
    "        formatted_conv = conv_prefix\n",
    "        formatted_conv += f\" <|prompter|> {input_text} {eos_token}\"\n",
    "        formatted_conv += f\" <|assistant|> {output_text} {eos_token}\"\n",
    "        batch.append(formatted_conv)\n",
    "    return {\"content\": batch}\n",
    "\n",
    "orca_subset = orca_subset.map(\n",
    "    preprocess,\n",
    "    batched=True,\n",
    "    remove_columns=orca_subset[\"train\"].column_names\n",
    ")\n",
    "\n",
    "\n",
    "print(orca_subset)\n",
    "print(orca_subset[\"train\"][0])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 33,
   "id": "f5ab6ea7",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "168ed4b68b1141a386ea9c4fe1472961",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Map:   0%|          | 0/10000 [00:00<?, ? examples/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "c517d915e179438cbd6885e22629da77",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Map:   0%|          | 0/5000 [00:00<?, ? examples/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "DatasetDict({\n",
      "    train: Dataset({\n",
      "        features: ['content'],\n",
      "        num_rows: 10000\n",
      "    })\n",
      "    test: Dataset({\n",
      "        features: ['content'],\n",
      "        num_rows: 5000\n",
      "    })\n",
      "})\n",
      "{'content': \"<|system|> You are an AI assistant that follows instruction extremely well. Help as much as you can. <|endoftext|> <|prompter|> Are there any markets in Thessaloniki that offer discounts or special deals on certain products? <|endoftext|> <|assistant|> As an AI language model, I don't have access to current information. However, several markets in Thessaloniki may offer discounts or special deals on certain products during seasonal sales or promotional events. Some popular markets in Thessaloniki include Modiano Market, Kapani Market, and the open-air market on Aristotelous Square. Visitors can check with local shops and vendors, look for signs advertising sales, or ask about discounts when making purchases. <|endoftext|> <|prompter|> Can you provide me with some tips on how to bargain for better prices in Thessaloniki markets? <|endoftext|> <|assistant|> Sure! Here are some tips for bargaining in Thessaloniki markets:\\n\\n1. Start with a friendly and polite conversation with the seller. Get to know them a little bit and try to establish a good rapport.\\n\\n2. Don't be the first to suggest a price. Ask the seller for the price, and then try to negotiate from there.\\n\\n3. Research the prices of similar products in other shops and try to use that information to get a better deal.\\n\\n4. Ask for a discount if you buy multiple items or if you pay in cash.\\n\\n5. Remember that bargaining is a common practice in markets, but don't be too aggressive or disrespectful.\\n\\n6. Be willing to walk away if the price is still too high. Often, sellers will lower the price if they think they're about to lose a potential sale.\\n\\n7. Don't forget to smile and be respectful throughout the negotiation process. A little bit of charm can go a long way! <|endoftext|> <|prompter|> I heard that some vendors in Thessaloniki markets may inflate prices for tourists. How can I avoid falling for this trick? <|endoftext|> <|assistant|> Unfortunately, it's true that some vendors in Thessaloniki markets may inflate prices for tourists. Here are some tips to avoid falling for this trick:\\n\\n1. Do some research ahead of time to get a general idea of the prices for the items you're interested in.\\n\\n2. Try to blend in with the locals. Dress and act like you belong in the area, and don't immediately reveal that you're a tourist.\\n\\n3. Be wary of deals that seem too good to be true. If a price seems suspiciously low or too high, do some more research before making a purchase.\\n\\n4. Don't hesitate to negotiate. If you think a price is too high, try bargaining with the vendor to bring it down to a more reasonable level.\\n\\n5. Be aware of your body language. Don't show too much excitement for an item, as this can signal to the vendor that you're willing to pay more.\\n\\n6. Always ask for a receipt, and make sure the price on the receipt matches the price you agreed upon.\\n\\n7. Finally, remember that it's okay to walk away if you're not comfortable with the price or the vendor's behavior. There are likely other vendors selling similar items in the area, so don't feel pressured to make a purchase. <|endoftext|>\"}\n"
     ]
    }
   ],
   "source": [
    "# Ultra Chat\n",
    "\n",
    "def preprocess(samples):\n",
    "    conv_prefix = f\"<|system|> {system_prompt} {eos_token}\"\n",
    "    batch = []\n",
    "    for sample in samples[\"data\"]:\n",
    "        formatted_conv = conv_prefix\n",
    "        for i, turn in enumerate(sample):\n",
    "            turn_prefix = \"<|assistant|>\" if (i+1)%2==0 else \"<|prompter|>\"\n",
    "            formatted_conv += f\" {turn_prefix} {turn} {eos_token}\"\n",
    "        batch.append(formatted_conv)\n",
    "    return {\"content\": batch}\n",
    "\n",
    "ultrachat = load_dataset(\"stingning/ultrachat\")\n",
    "ultrachat = ultrachat[\"train\"].shuffle()\n",
    "ultrachat_subset = ultrachat.train_test_split(10000)\n",
    "\n",
    "test = ultrachat_subset[\"train\"].train_test_split(5000)\n",
    "ultrachat_subset[\"train\"] = ultrachat_subset[\"test\"]\n",
    "ultrachat_subset[\"test\"] = test[\"test\"]\n",
    "\n",
    "ultrachat_subset = ultrachat_subset.map(\n",
    "    preprocess,\n",
    "    batched=True,\n",
    "    remove_columns=ultrachat_subset[\"train\"].column_names\n",
    ")\n",
    "\n",
    "\n",
    "print(ultrachat_subset)\n",
    "print(ultrachat_subset[\"train\"][0])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 46,
   "id": "d43fa495",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "168762c4f5b64ae6b345f64339f695e6",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Filter:   0%|          | 0/80677 [00:00<?, ? examples/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "78e75153ef8343289d17d766ad413dc7",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Map:   0%|          | 0/10000 [00:00<?, ? examples/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "ceadf33240c3409cb4b548b53e3934ca",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Map:   0%|          | 0/5000 [00:00<?, ? examples/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "DatasetDict({\n",
      "    train: Dataset({\n",
      "        features: ['content'],\n",
      "        num_rows: 10000\n",
      "    })\n",
      "    test: Dataset({\n",
      "        features: ['content'],\n",
      "        num_rows: 5000\n",
      "    })\n",
      "})\n",
      "{'content': '<|system|> You are an AI assistant that follows instruction extremely well. Help as much as you can. <|endoftext|> <|prompter|> In virology, which of the following statements is true about the Baltimore classification system?\\nA. It classifies viruses into seven groups based on their type of genome and method of replication.\\nB. It classifies viruses based on their shape and size.\\nC. It classifies viruses based on their host organism.\\nD. It classifies viruses based on their pathogenicity.\\nE. None of the above. <|endoftext|> <|assistant|> A. It classifies viruses into seven groups based on their type of genome and method of replication. <|endoftext|>'}\n"
     ]
    }
   ],
   "source": [
    "# Airboros 2.2\n",
    "\n",
    "airboros = load_dataset(\"jondurbin/airoboros-2.2\")\n",
    "airboros = airboros[\"train\"].shuffle()\n",
    "airboros = airboros.filter(lambda example: example['skip_prompt_formatting']==False)\n",
    "airboros_subset = airboros.train_test_split(10000)\n",
    "\n",
    "test = airboros_subset[\"train\"].train_test_split(5000)\n",
    "airboros_subset[\"train\"] = airboros_subset[\"test\"]\n",
    "airboros_subset[\"test\"] = test[\"test\"]\n",
    "airboros_subset\n",
    "\n",
    "def preprocess(samples):\n",
    "    batch = []\n",
    "    for instruction, input_text, output_text in zip(samples[\"system\"], samples[\"instruction\"], samples[\"response\"]):\n",
    "        instruction = system_prompt if \"A chat.\" in instruction else instruction\n",
    "        conv_prefix = f\"<|system|> {instruction} {eos_token}\"\n",
    "        formatted_conv = conv_prefix\n",
    "        formatted_conv += f\" <|prompter|> {input_text} {eos_token}\"\n",
    "        formatted_conv += f\" <|assistant|> {output_text} {eos_token}\"\n",
    "        batch.append(formatted_conv)\n",
    "    return {\"content\": batch}\n",
    "\n",
    "airboros_subset = airboros_subset.map(\n",
    "    preprocess,\n",
    "    batched=True,\n",
    "    remove_columns=airboros_subset[\"train\"].column_names\n",
    ")\n",
    "\n",
    "\n",
    "print(airboros_subset)\n",
    "print(airboros_subset[\"train\"][0])\n",
    "\n",
    "\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 47,
   "id": "31f5c4a8",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "DatasetDict({\n",
       "    train: Dataset({\n",
       "        features: ['content'],\n",
       "        num_rows: 73302\n",
       "    })\n",
       "    test: Dataset({\n",
       "        features: ['content'],\n",
       "        num_rows: 23318\n",
       "    })\n",
       "})"
      ]
     },
     "execution_count": 47,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "from datasets import concatenate_datasets, DatasetDict\n",
    "full_dataset = DatasetDict({split: concatenate_datasets([lima[split], \n",
    "                                                         guanaco[split], \n",
    "                                                         platypus[split], \n",
    "                                                         dolphin_subset[split],\n",
    "                                                         orca_subset[split],\n",
    "                                                         ultrachat_subset[split],\n",
    "                                                         airboros_subset[split], \n",
    "                                                        ]) for split in [\"train\", \"test\"]})\n",
    "full_dataset"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 49,
   "id": "1b1c962d",
   "metadata": {},
   "outputs": [],
   "source": [
    "full_dataset[\"train\"] = full_dataset[\"train\"].shuffle()\n",
    "full_dataset[\"test\"] = full_dataset[\"test\"].shuffle()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 50,
   "id": "4f960de1",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "9b191938cbc24505ad46e2de0cdc6fd2",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Pushing dataset shards to the dataset hub:   0%|          | 0/1 [00:00<?, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "4c58ff00dac249a7aea07ddca40df746",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Creating parquet from Arrow format:   0%|          | 0/74 [00:00<?, ?ba/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "583e8c9c9e9a429f80e8e016f8575fdb",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Deleting unused files from dataset repository:   0%|          | 0/1 [00:00<?, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "875c3691971b4c22af10b26eb39b6f29",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Pushing dataset shards to the dataset hub:   0%|          | 0/1 [00:00<?, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "878716141c6a493584c8f31b2e06420d",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Creating parquet from Arrow format:   0%|          | 0/24 [00:00<?, ?ba/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "72a69a78a9bc45f18c8481acd125d62c",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Deleting unused files from dataset repository:   0%|          | 0/1 [00:00<?, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "ca00661a445047eb8b03de4a63c3110f",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Downloading metadata:   0%|          | 0.00/587 [00:00<?, ?B/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "full_dataset.push_to_hub(\"chat-instruct-mixer\", private=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "a734789e",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.11"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
